package org.bouncycastle.pqc.crypto.sphincsplus;

import org.bouncycastle.util.Arrays;

/**
 * Haraka-512 v2, https://eprint.iacr.org/2016/098.pdf
 * <p>
 * Haraka512-256 with reference to Python Reference Impl from: https://github.com/sphincs/sphincsplus
 * </p>
 */
class HarakaSBase
{
    /**
     * Round constants for the 512-bit permutation: ten entries of eight 64-bit words each. Entry
     * {@code r} is XORed word-for-word into the bitsliced state in round {@code r} of
     * {@link #haraka512Perm(byte[])}. The field is not final, so it can be replaced after construction.
     */
    protected long[][] haraka512_rc = new long[][]{
        {0x24cf0ab9086f628bL, 0xbdd6eeecc83b8382L, 0xd96fb0306cdad0a7L, 0xaace082ac8f95f89L, 0x449d8e8870d7041fL, 0x49bb2f80b2b3e2f8L, 0x0569ae98d93bb258L, 0x23dc9691e7d6a4b1L},
        {0xd8ba10ede0fe5b6eL, 0x7ecf7dbe424c7b8eL, 0x6ea9949c6df62a31L, 0xbf3f3c97ec9c313eL, 0x241d03a196a1861eL, 0xead3a51116e5a2eaL, 0x77d479fcad9574e3L, 0x18657a1af894b7a0L},
        {0x10671e1a7f595522L, 0xd9a00ff675d28c7bL, 0x2f1edf0d2b9ba661L, 0xb8ff58b8e3de45f9L, 0xee29261da9865c02L, 0xd1532aa4b50bdf43L, 0x8bf858159b231bb1L, 0xdf17439d22d4f599L},
        {0xdd4b2f0870b918c0L, 0x757a81f3b39b1bb6L, 0x7a5c556898952e3fL, 0x7dd70a16d915d87aL, 0x3ae61971982b8301L, 0xc3ab319e030412beL, 0x17c0033ac094a8cbL, 0x5a0630fc1a8dc4efL},
        {0x17708988c1632f73L, 0xf92ddae090b44f4fL, 0x11ac0285c43aa314L, 0x509059941936b8baL, 0xd03e152fa2ce9b69L, 0x3fbcbcb63a32998bL, 0x6204696d692254f7L, 0x915542ed93ec59b4L},
        {0xf4ed94aa8879236eL, 0xff6cb41cd38e03c0L, 0x069b38602368aeabL, 0x669495b820f0ddbaL, 0xf42013b1b8bf9e3dL, 0xcf935efe6439734dL, 0xbc1dcf42ca29e3f8L, 0x7e6d3ed29f78ad67L},
        {0xf3b0f6837ffcddaaL, 0x3a76faef934ddf41L, 0xcec7ae583a9c8e35L, 0xe4dd18c68f0260afL, 0x2c0e5df1ad398eaaL, 0x478df5236ae22e8cL, 0xfb944c46fe865f39L, 0xaa48f82f028132baL},
        {0x231b9ae2b76aca77L, 0x292a76a712db0b40L, 0x5850625dc8134491L, 0x73137dd469810fb5L, 0x8a12a6a202a474fdL, 0xd36fd9daa78bdb80L, 0xb34c5e733505706fL, 0xbaf1cdca818d9d96L},
        {0x2e99781335e8c641L, 0xbddfe5cce47d560eL, 0xf74e9bf32e5e040cL, 0x1d7a709d65996be9L, 0x670df36a9cf66cddL, 0xd05ef84a176a2875L, 0x0f888e828cb1c44eL, 0x1a79e9c9727b052cL},
        {0x83497348628d84deL, 0x2e9387d51f22a754L, 0xb000068da2f852d6L, 0x378c9e1190fd6fe5L, 0x870027c316de7293L, 0xe51a9d4462e047bbL, 0x90ecf7f8c6251195L, 0x655953bfbed90a9cL},
    };

    /**
     * Round constants for the 256-bit permutation: ten entries of eight 32-bit words each, XORed into
     * the state by {@link #haraka256Perm(byte[])}. All zero as allocated here.
     * NOTE(review): presumably populated by a subclass (e.g. via
     * {@link #interleaveConstant32(int[], byte[], int)}) before use - confirm against callers.
     */
    protected int[][] haraka256_rc = new int[10][8];

    /** 64-byte working buffer; both permutations read their input from the start of it. */
    protected final byte[] buffer;

    /**
     * Position within {@link #buffer}; set to 0 on construction and by {@link #reset()}. It is not
     * otherwise read in this class.
     */
    protected int off;

    /**
     * Allocates the 64-byte working buffer and sets the offset to 0.
     */
    protected HarakaSBase()
    {
        this.buffer = new byte[64];
        off = 0;
    }

    /**
     * Sets the offset back to 0 and clears the working buffer.
     */
    protected void reset()
    {
        off = 0;
        Arrays.clear(buffer);
    }

    /**
     * Decodes one little-endian 32-bit word.
     *
     * @param input    source bytes
     * @param startPos index of the least significant byte
     * @return the four bytes starting at {@code startPos} combined as a little-endian int
     */
    private static int brDec32Le(byte[] input, int startPos)
    {
        return (input[startPos] & 0xFF)
            | ((input[startPos + 1] << 8) & 0xFF00)
            | ((input[startPos + 2] << 16) & 0xFF0000)
            | (input[startPos + 3] << 24);
    }

    /**
     * Decodes {@code output.length} consecutive little-endian 32-bit words.
     *
     * @param input    source bytes
     * @param output   receives one word per element
     * @param inputPos index in {@code input} of the first word
     */
    private static void brRangeDec32Le(byte[] input, int[] output, int inputPos)
    {
        for (int i = 0; i < output.length; ++i)
        {
            output[i] = brDec32Le(input, inputPos + (i << 2));
        }
    }

    /**
     * Stores a 32-bit word as four little-endian bytes.
     *
     * @param dst      destination array
     * @param x        word to store
     * @param startPos index in {@code dst} that receives the least significant byte
     */
    private static void brEnc32Le(byte[] dst, int x, int startPos)
    {
        for (int i = 0; i < 4; ++i)
        {
            dst[startPos + i] = (byte)(x >> (i << 3));
        }
    }

    /**
     * Loads 64 bytes into the eight-word 64-bit bitsliced representation: decodes sixteen
     * little-endian words, interleaves them four at a time and applies the orthogonalisation step.
     * All eight elements of {@code q} are overwritten.
     *
     * @param q        receives the bitsliced state (at least 8 elements)
     * @param input    source bytes
     * @param startPos index in {@code input} of the first of the 64 bytes
     */
    private static void loadBitsliced64(long[] q, byte[] input, int startPos)
    {
        int[] w = new int[16];
        brRangeDec32Le(input, w, startPos);
        for (int i = 0; i < 4; ++i)
        {
            brAesCt64InterleaveIn(q, i, w, i << 2);
        }
        brAesCt64Ortho(q);
    }

    /**
     * Converts 64 bytes of {@code input} into the bitsliced form used by the 512-bit permutation.
     *
     * @param output   receives eight 64-bit words
     * @param input    source bytes
     * @param startPos index in {@code input} of the first of the 64 bytes
     */
    protected void interleaveConstant(long[] output, byte[] input, int startPos)
    {
        loadBitsliced64(output, input, startPos);
    }

    /**
     * Converts 32 bytes of {@code input} into the bitsliced form used by the 256-bit permutation.
     * Even elements of {@code output} come from the first 16 bytes, odd elements from the following
     * 16 bytes; the orthogonalisation step is then applied.
     *
     * @param output   receives eight 32-bit words
     * @param input    source bytes
     * @param startPos index in {@code input} of the first of the 32 bytes
     */
    protected void interleaveConstant32(int[] output, byte[] input, int startPos)
    {
        for (int i = 0; i < 4; ++i)
        {
            output[i << 1] = brDec32Le(input, startPos + (i << 2));
            output[(i << 1) + 1] = brDec32Le(input, startPos + (i << 2) + 16);
        }
        brAesCtOrtho(output);
    }

    /**
     * Applies the 512-bit permutation to the contents of {@link #buffer} and writes the 64-byte
     * result to {@code output}. The buffer itself is not modified.
     * <p>
     * Five iterations are run; each performs two rounds of S-box, row shift, column mix and
     * round-constant addition, followed by a fixed bit rearrangement of every state word.
     * </p>
     *
     * @param output receives 64 bytes starting at index 0
     */
    protected void haraka512Perm(byte[] output)
    {
        int[] w = new int[16];
        long[] q = new long[8];
        int i, j;
        loadBitsliced64(q, buffer, 0);
        for (i = 0; i < 5; ++i)
        {
            for (j = 0; j < 2; ++j)
            {
                brAesCt64BitsliceSbox(q);
                shiftRows(q);
                mixColumns(q);
                addRoundKey(q, haraka512_rc[(i << 1) + j]);
            }
            mix512(q);
        }
        // Undo the bitslicing and serialise the sixteen words little-endian.
        brAesCt64Ortho(q);
        for (i = 0; i < 4; i++)
        {
            brAesCt64InterleaveOut(w, q, i);
        }
        for (i = 0; i < 16; ++i)
        {
            brEnc32Le(output, w[i], i << 2);
        }
    }

    /**
     * Rearranges the bits of each of the eight 64-bit state words with a fixed set of masks and
     * shifts; the same pattern is applied to every 16-bit group of a word.
     * NOTE(review): presumably the Haraka-512 mixing layer expressed on the bitsliced state - confirm
     * against the reference implementation.
     */
    private static void mix512(long[] q)
    {
        long tmp_q;
        for (int j = 0; j < 8; j++)
        {
            tmp_q = q[j];
            q[j] = (tmp_q & 0x0001000100010001L) << 5 |
                (tmp_q & 0x0002000200020002L) << 12 |
                (tmp_q & 0x0004000400040004L) >>> 1 |
                (tmp_q & 0x0008000800080008L) << 6 |
                (tmp_q & 0x0020002000200020L) << 9 |
                (tmp_q & 0x0040004000400040L) >>> 4 |
                (tmp_q & 0x0080008000800080L) << 3 |
                (tmp_q & 0x2100210021002100L) >>> 5 |
                (tmp_q & 0x0210021002100210L) << 2 |
                (tmp_q & 0x0800080008000800L) << 4 |
                (tmp_q & 0x1000100010001000L) >>> 12 |
                (tmp_q & 0x4000400040004000L) >>> 10 |
                (tmp_q & 0x8400840084008400L) >>> 3;
        }
    }

    /**
     * Applies the 256-bit permutation to the first 32 bytes of {@link #buffer} and writes the
     * 32-byte result to {@code output}. The buffer itself is not modified.
     * <p>
     * Five iterations are run; each performs two rounds of S-box, row shift, column mix and
     * round-constant addition (from {@link #haraka256_rc}), followed by a fixed bit rearrangement of
     * every state word.
     * </p>
     *
     * @param output receives 32 bytes starting at index 0
     */
    protected void haraka256Perm(byte[] output)
    {
        int[] q = new int[8];
        int i, j;
        interleaveConstant32(q, buffer, 0);
        for (i = 0; i < 5; ++i)
        {
            for (j = 0; j < 2; ++j)
            {
                brAesCtBitsliceSbox(q);
                shiftRows32(q);
                mixColumns32(q);
                addRoundKey32(q, haraka256_rc[(i << 1) + j]);
            }
            mix256(q);
        }
        // Undo the bitslicing; even words go to the first 16 output bytes, odd words to the next 16.
        brAesCtOrtho(q);
        for (i = 0; i < 4; i++)
        {
            brEnc32Le(output, q[i << 1], i << 2);
            brEnc32Le(output, q[(i << 1) + 1], (i << 2) + 16);
        }
    }

    /**
     * Rearranges the bits of each of the eight 32-bit state words with a fixed set of masks and
     * shifts; the same pattern is applied to every byte of a word (bits 0 and 7 stay in place).
     * NOTE(review): presumably the Haraka-256 mixing layer expressed on the bitsliced state - confirm
     * against the reference implementation.
     */
    private static void mix256(int[] q)
    {
        int tmp_q;
        for (int j = 0; j < 8; j++)
        {
            tmp_q = q[j];
            q[j] = (tmp_q & 0x81818181) |
                (tmp_q & 0x02020202) << 1 |
                (tmp_q & 0x04040404) << 2 |
                (tmp_q & 0x08080808) << 3 |
                (tmp_q & 0x10101010) >>> 3 |
                (tmp_q & 0x20202020) >>> 2 |
                (tmp_q & 0x40404040) >>> 1;
        }
    }

    /**
     * Interleaves four 32-bit words into two 64-bit words. Each input word has its four bytes spread
     * into the low bytes of the four 16-bit groups of a long; words 0 and 2 are then combined into
     * {@code q[qPos]} and words 1 and 3 into {@code q[qPos + 4]}, the second of each pair occupying
     * the high byte of every 16-bit group.
     *
     * @param q        destination state
     * @param qPos     index of the first destination word (the second is {@code qPos + 4})
     * @param w        source words
     * @param startPos index in {@code w} of the first of the four source words
     */
    private static void brAesCt64InterleaveIn(long[] q, int qPos, int[] w, int startPos)
    {
        long x0, x1, x2, x3;
        // Zero-extend each int to a long.
        x0 = w[startPos] & 0x00000000FFFFFFFFL;
        x1 = w[startPos + 1] & 0x00000000FFFFFFFFL;
        x2 = w[startPos + 2] & 0x00000000FFFFFFFFL;
        x3 = w[startPos + 3] & 0x00000000FFFFFFFFL;
        x0 |= x0 << 16;
        x1 |= x1 << 16;
        x2 |= x2 << 16;
        x3 |= x3 << 16;
        x0 &= 0x0000FFFF0000FFFFL;
        x1 &= 0x0000FFFF0000FFFFL;
        x2 &= 0x0000FFFF0000FFFFL;
        x3 &= 0x0000FFFF0000FFFFL;
        x0 |= x0 << 8;
        x1 |= x1 << 8;
        x2 |= x2 << 8;
        x3 |= x3 << 8;
        x0 &= 0x00FF00FF00FF00FFL;
        x1 &= 0x00FF00FF00FF00FFL;
        x2 &= 0x00FF00FF00FF00FFL;
        x3 &= 0x00FF00FF00FF00FFL;
        q[qPos] = x0 | (x2 << 8);
        q[qPos + 4] = x1 | (x3 << 8);
    }

    /**
     * Bitsliced S-box over eight 32-bit words, applied in place. {@code q[0]} holds the low bit of
     * every S-box input and {@code q[7]} the high bit; the outputs are stored the same way.
     */
    private static void brAesCtBitsliceSbox(int[] q)
    {
        /*
         * This S-box implementation is a straightforward translation of
         * the circuit described by Boyar and Peralta in "A new
         * combinational logic minimization technique with applications
         * to cryptology" (https://eprint.iacr.org/2009/191.pdf).
         *
         * Note that variables x* (input) and s* (output) are numbered
         * in "reverse" order (x0 is the high bit, x7 is the low bit).
         */

        int x0, x1, x2, x3, x4, x5, x6, x7;
        int y1, y2, y3, y4, y5, y6, y7, y8, y9;
        int y10, y11, y12, y13, y14, y15, y16, y17, y18, y19;
        int y20, y21;
        int z0, z1, z2, z3, z4, z5, z6, z7, z8, z9;
        int z10, z11, z12, z13, z14, z15, z16, z17;
        int t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;
        int t10, t11, t12, t13, t14, t15, t16, t17, t18, t19;
        int t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
        int t30, t31, t32, t33, t34, t35, t36, t37, t38, t39;
        int t40, t41, t42, t43, t44, t45, t46, t47, t48, t49;
        int t50, t51, t52, t53, t54, t55, t56, t57, t58, t59;
        int t60, t61, t62, t63, t64, t65, t66, t67;
        int s0, s1, s2, s3, s4, s5, s6, s7;

        x0 = q[7];
        x1 = q[6];
        x2 = q[5];
        x3 = q[4];
        x4 = q[3];
        x5 = q[2];
        x6 = q[1];
        x7 = q[0];

        /*
         * Top linear transformation.
         */
        y14 = x3 ^ x5;
        y13 = x0 ^ x6;
        y9 = x0 ^ x3;
        y8 = x0 ^ x5;
        t0 = x1 ^ x2;
        y1 = t0 ^ x7;
        y4 = y1 ^ x3;
        y12 = y13 ^ y14;
        y2 = y1 ^ x0;
        y5 = y1 ^ x6;
        y3 = y5 ^ y8;
        t1 = x4 ^ y12;
        y15 = t1 ^ x5;
        y20 = t1 ^ x1;
        y6 = y15 ^ x7;
        y10 = y15 ^ t0;
        y11 = y20 ^ y9;
        y7 = x7 ^ y11;
        y17 = y10 ^ y11;
        y19 = y10 ^ y8;
        y16 = t0 ^ y11;
        y21 = y13 ^ y16;
        y18 = x0 ^ y16;

        /*
         * Non-linear section.
         */
        t2 = y12 & y15;
        t3 = y3 & y6;
        t4 = t3 ^ t2;
        t5 = y4 & x7;
        t6 = t5 ^ t2;
        t7 = y13 & y16;
        t8 = y5 & y1;
        t9 = t8 ^ t7;
        t10 = y2 & y7;
        t11 = t10 ^ t7;
        t12 = y9 & y11;
        t13 = y14 & y17;
        t14 = t13 ^ t12;
        t15 = y8 & y10;
        t16 = t15 ^ t12;
        t17 = t4 ^ t14;
        t18 = t6 ^ t16;
        t19 = t9 ^ t14;
        t20 = t11 ^ t16;
        t21 = t17 ^ y20;
        t22 = t18 ^ y19;
        t23 = t19 ^ y21;
        t24 = t20 ^ y18;

        t25 = t21 ^ t22;
        t26 = t21 & t23;
        t27 = t24 ^ t26;
        t28 = t25 & t27;
        t29 = t28 ^ t22;
        t30 = t23 ^ t24;
        t31 = t22 ^ t26;
        t32 = t31 & t30;
        t33 = t32 ^ t24;
        t34 = t23 ^ t33;
        t35 = t27 ^ t33;
        t36 = t24 & t35;
        t37 = t36 ^ t34;
        t38 = t27 ^ t36;
        t39 = t29 & t38;
        t40 = t25 ^ t39;

        t41 = t40 ^ t37;
        t42 = t29 ^ t33;
        t43 = t29 ^ t40;
        t44 = t33 ^ t37;
        t45 = t42 ^ t41;
        z0 = t44 & y15;
        z1 = t37 & y6;
        z2 = t33 & x7;
        z3 = t43 & y16;
        z4 = t40 & y1;
        z5 = t29 & y7;
        z6 = t42 & y11;
        z7 = t45 & y17;
        z8 = t41 & y10;
        z9 = t44 & y12;
        z10 = t37 & y3;
        z11 = t33 & y4;
        z12 = t43 & y13;
        z13 = t40 & y5;
        z14 = t29 & y2;
        z15 = t42 & y9;
        z16 = t45 & y14;
        z17 = t41 & y8;

        /*
         * Bottom linear transformation.
         */
        t46 = z15 ^ z16;
        t47 = z10 ^ z11;
        t48 = z5 ^ z13;
        t49 = z9 ^ z10;
        t50 = z2 ^ z12;
        t51 = z2 ^ z5;
        t52 = z7 ^ z8;
        t53 = z0 ^ z3;
        t54 = z6 ^ z7;
        t55 = z16 ^ z17;
        t56 = z12 ^ t48;
        t57 = t50 ^ t53;
        t58 = z4 ^ t46;
        t59 = z3 ^ t54;
        t60 = t46 ^ t57;
        t61 = z14 ^ t57;
        t62 = t52 ^ t58;
        t63 = t49 ^ t58;
        t64 = z4 ^ t59;
        t65 = t61 ^ t62;
        t66 = z1 ^ t63;
        s0 = t59 ^ t63;
        s6 = t56 ^ ~t62;
        s7 = t48 ^ ~t60;
        t67 = t64 ^ t65;
        s3 = t53 ^ t66;
        s4 = t51 ^ t66;
        s5 = t47 ^ t65;
        s1 = t64 ^ ~s3;
        s2 = t55 ^ ~t67;

        q[7] = s0;
        q[6] = s1;
        q[5] = s2;
        q[4] = s3;
        q[3] = s4;
        q[2] = s5;
        q[1] = s6;
        q[0] = s7;
    }

    /**
     * Row shift on the 32-bit bitsliced state, in place: in each of the first eight words the low
     * byte is kept and the three higher bytes are rotated right by 2, 4 and 6 bits respectively.
     */
    private static void shiftRows32(int[] q)
    {
        int x;
        for (int i = 0; i < 8; i++)
        {
            x = q[i];
            q[i] = (x & 0x000000FF)
                | ((x & 0x0000FC00) >>> 2) | ((x & 0x00000300) << 6)
                | ((x & 0x00F00000) >>> 4) | ((x & 0x000F0000) << 4)
                | ((x & 0xC0000000) >>> 6) | ((x & 0x3F000000) << 2);
        }
    }

    /**
     * Column mix on the 32-bit bitsliced state, in place. Each output word is an XOR of input words,
     * their 8-bit right rotations and a 16-bit rotation.
     */
    private static void mixColumns32(int[] q)
    {
        int q0, q1, q2, q3, q4, q5, q6, q7;
        int r0, r1, r2, r3, r4, r5, r6, r7;

        q0 = q[0];
        q1 = q[1];
        q2 = q[2];
        q3 = q[3];
        q4 = q[4];
        q5 = q[5];
        q6 = q[6];
        q7 = q[7];
        // rN is qN rotated right by 8 bits.
        r0 = (q0 >>> 8) | (q0 << 24);
        r1 = (q1 >>> 8) | (q1 << 24);
        r2 = (q2 >>> 8) | (q2 << 24);
        r3 = (q3 >>> 8) | (q3 << 24);
        r4 = (q4 >>> 8) | (q4 << 24);
        r5 = (q5 >>> 8) | (q5 << 24);
        r6 = (q6 >>> 8) | (q6 << 24);
        r7 = (q7 >>> 8) | (q7 << 24);

        q[0] = q7 ^ r7 ^ r0 ^ rotr16(q0 ^ r0);
        q[1] = q0 ^ r0 ^ q7 ^ r7 ^ r1 ^ rotr16(q1 ^ r1);
        q[2] = q1 ^ r1 ^ r2 ^ rotr16(q2 ^ r2);
        q[3] = q2 ^ r2 ^ q7 ^ r7 ^ r3 ^ rotr16(q3 ^ r3);
        q[4] = q3 ^ r3 ^ q7 ^ r7 ^ r4 ^ rotr16(q4 ^ r4);
        q[5] = q4 ^ r4 ^ r5 ^ rotr16(q5 ^ r5);
        q[6] = q5 ^ r5 ^ r6 ^ rotr16(q6 ^ r6);
        q[7] = q6 ^ r6 ^ r7 ^ rotr16(q7 ^ r7);
    }

    /**
     * XORs the first eight words of {@code sk} into the first eight words of {@code q}.
     */
    private static void addRoundKey32(int[] q, int[] sk)
    {
        for (int i = 0; i < 8; ++i)
        {
            q[i] ^= sk[i];
        }
    }

    /**
     * Rotates a 32-bit word by 16 bits, i.e. swaps its two halves.
     */
    private static int rotr16(int x)
    {
        return (x << 16) | (x >>> 16);
    }

    /**
     * Orthogonalisation step on eight 64-bit words, in place: three passes of pairwise bit-group
     * exchanges with group sizes 1, 2 and 4 between words at distance 1, 2 and 4.
     */
    private static void brAesCt64Ortho(long[] q)
    {
        swapn(q, 1, 0, 1);
        swapn(q, 1, 2, 3);
        swapn(q, 1, 4, 5);
        swapn(q, 1, 6, 7);

        swapn(q, 2, 0, 2);
        swapn(q, 2, 1, 3);
        swapn(q, 2, 4, 6);
        swapn(q, 2, 5, 7);

        swapn(q, 4, 0, 4);
        swapn(q, 4, 1, 5);
        swapn(q, 4, 2, 6);
        swapn(q, 4, 3, 7);
    }

    /**
     * Orthogonalisation step on eight 32-bit words, in place: three passes of pairwise bit-group
     * exchanges with group sizes 1, 2 and 4 between words at distance 1, 2 and 4.
     */
    private static void brAesCtOrtho(int[] q)
    {
        swapn32(q, 1, 0, 1);
        swapn32(q, 1, 2, 3);
        swapn32(q, 1, 4, 5);
        swapn32(q, 1, 6, 7);

        swapn32(q, 2, 0, 2);
        swapn32(q, 2, 1, 3);
        swapn32(q, 2, 4, 6);
        swapn32(q, 2, 5, 7);

        swapn32(q, 4, 0, 4);
        swapn32(q, 4, 1, 5);
        swapn32(q, 4, 2, 6);
        swapn32(q, 4, 3, 7);
    }

    /**
     * Exchanges bit groups between two 32-bit words. Afterwards {@code q[pos1]} holds its own
     * low-mask bits plus the low-mask bits of {@code q[pos2]} shifted up by {@code s}, and
     * {@code q[pos2]} holds its own high-mask bits plus the high-mask bits of {@code q[pos1]}
     * shifted down by {@code s}.
     *
     * @param q    state words
     * @param s    group size: 1, 2 or 4; any other value leaves {@code q} untouched
     * @param pos1 index of the first word
     * @param pos2 index of the second word
     */
    private static void swapn32(int[] q, int s, int pos1, int pos2)
    {
        int cl, ch;
        switch (s)
        {
        case 1:
            cl = 0x55555555;
            ch = 0xAAAAAAAA;
            break;
        case 2:
            cl = 0x33333333;
            ch = 0xCCCCCCCC;
            break;
        case 4:
            cl = 0x0F0F0F0F;
            ch = 0xF0F0F0F0;
            break;
        default:
            // Same handling as the 64-bit variant: without masks the words would be zeroed.
            return;
        }
        int a = q[pos1], b = q[pos2];
        q[pos1] = (a & cl) | ((b & cl) << s);
        q[pos2] = ((a & ch) >>> s) | (b & ch);
    }

    /**
     * Exchanges bit groups between two 64-bit words. Afterwards {@code q[pos1]} holds its own
     * low-mask bits plus the low-mask bits of {@code q[pos2]} shifted up by {@code s}, and
     * {@code q[pos2]} holds its own high-mask bits plus the high-mask bits of {@code q[pos1]}
     * shifted down by {@code s}.
     *
     * @param q    state words
     * @param s    group size: 1, 2 or 4; any other value leaves {@code q} untouched
     * @param pos1 index of the first word
     * @param pos2 index of the second word
     */
    private static void swapn(long[] q, int s, int pos1, int pos2)
    {
        long cl, ch;
        switch (s)
        {
        case 1:
            cl = 0x5555555555555555L;
            ch = 0xAAAAAAAAAAAAAAAAL;
            break;
        case 2:
            cl = 0x3333333333333333L;
            ch = 0xCCCCCCCCCCCCCCCCL;
            break;
        case 4:
            cl = 0x0F0F0F0F0F0F0F0FL;
            ch = 0xF0F0F0F0F0F0F0F0L;
            break;
        default:
            return;
        }
        long a = q[pos1], b = q[pos2];
        q[pos1] = (a & cl) | ((b & cl) << s);
        q[pos2] = ((a & ch) >>> s) | (b & ch);
    }

    /**
     * Bitsliced S-box over eight 64-bit words, applied in place. {@code q[0]} holds the low bit of
     * every S-box input and {@code q[7]} the high bit; the outputs are stored the same way.
     */
    private static void brAesCt64BitsliceSbox(long[] q)
    {
        /*
         * This S-box implementation is a straightforward translation of
         * the circuit described by Boyar and Peralta in "A new
         * combinational logic minimization technique with applications
         * to cryptology" (https://eprint.iacr.org/2009/191.pdf).
         *
         * Note that variables x* (input) and s* (output) are numbered
         * in "reverse" order (x0 is the high bit, x7 is the low bit).
         */

        long x0, x1, x2, x3, x4, x5, x6, x7;
        long y1, y2, y3, y4, y5, y6, y7, y8, y9;
        long y10, y11, y12, y13, y14, y15, y16, y17, y18, y19;
        long y20, y21;
        long z0, z1, z2, z3, z4, z5, z6, z7, z8, z9;
        long z10, z11, z12, z13, z14, z15, z16, z17;
        long t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;
        long t10, t11, t12, t13, t14, t15, t16, t17, t18, t19;
        long t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
        long t30, t31, t32, t33, t34, t35, t36, t37, t38, t39;
        long t40, t41, t42, t43, t44, t45, t46, t47, t48, t49;
        long t50, t51, t52, t53, t54, t55, t56, t57, t58, t59;
        long t60, t61, t62, t63, t64, t65, t66, t67;
        long s0, s1, s2, s3, s4, s5, s6, s7;

        x0 = q[7];
        x1 = q[6];
        x2 = q[5];
        x3 = q[4];
        x4 = q[3];
        x5 = q[2];
        x6 = q[1];
        x7 = q[0];

        /*
         * Top linear transformation.
         */
        y14 = x3 ^ x5;
        y13 = x0 ^ x6;
        y9 = x0 ^ x3;
        y8 = x0 ^ x5;
        t0 = x1 ^ x2;
        y1 = t0 ^ x7;
        y4 = y1 ^ x3;
        y12 = y13 ^ y14;
        y2 = y1 ^ x0;
        y5 = y1 ^ x6;
        y3 = y5 ^ y8;
        t1 = x4 ^ y12;
        y15 = t1 ^ x5;
        y20 = t1 ^ x1;
        y6 = y15 ^ x7;
        y10 = y15 ^ t0;
        y11 = y20 ^ y9;
        y7 = x7 ^ y11;
        y17 = y10 ^ y11;
        y19 = y10 ^ y8;
        y16 = t0 ^ y11;
        y21 = y13 ^ y16;
        y18 = x0 ^ y16;

        /*
         * Non-linear section.
         */
        t2 = y12 & y15;
        t3 = y3 & y6;
        t4 = t3 ^ t2;
        t5 = y4 & x7;
        t6 = t5 ^ t2;
        t7 = y13 & y16;
        t8 = y5 & y1;
        t9 = t8 ^ t7;
        t10 = y2 & y7;
        t11 = t10 ^ t7;
        t12 = y9 & y11;
        t13 = y14 & y17;
        t14 = t13 ^ t12;
        t15 = y8 & y10;
        t16 = t15 ^ t12;
        t17 = t4 ^ t14;
        t18 = t6 ^ t16;
        t19 = t9 ^ t14;
        t20 = t11 ^ t16;
        t21 = t17 ^ y20;
        t22 = t18 ^ y19;
        t23 = t19 ^ y21;
        t24 = t20 ^ y18;

        t25 = t21 ^ t22;
        t26 = t21 & t23;
        t27 = t24 ^ t26;
        t28 = t25 & t27;
        t29 = t28 ^ t22;
        t30 = t23 ^ t24;
        t31 = t22 ^ t26;
        t32 = t31 & t30;
        t33 = t32 ^ t24;
        t34 = t23 ^ t33;
        t35 = t27 ^ t33;
        t36 = t24 & t35;
        t37 = t36 ^ t34;
        t38 = t27 ^ t36;
        t39 = t29 & t38;
        t40 = t25 ^ t39;

        t41 = t40 ^ t37;
        t42 = t29 ^ t33;
        t43 = t29 ^ t40;
        t44 = t33 ^ t37;
        t45 = t42 ^ t41;
        z0 = t44 & y15;
        z1 = t37 & y6;
        z2 = t33 & x7;
        z3 = t43 & y16;
        z4 = t40 & y1;
        z5 = t29 & y7;
        z6 = t42 & y11;
        z7 = t45 & y17;
        z8 = t41 & y10;
        z9 = t44 & y12;
        z10 = t37 & y3;
        z11 = t33 & y4;
        z12 = t43 & y13;
        z13 = t40 & y5;
        z14 = t29 & y2;
        z15 = t42 & y9;
        z16 = t45 & y14;
        z17 = t41 & y8;

        /*
         * Bottom linear transformation.
         */
        t46 = z15 ^ z16;
        t47 = z10 ^ z11;
        t48 = z5 ^ z13;
        t49 = z9 ^ z10;
        t50 = z2 ^ z12;
        t51 = z2 ^ z5;
        t52 = z7 ^ z8;
        t53 = z0 ^ z3;
        t54 = z6 ^ z7;
        t55 = z16 ^ z17;
        t56 = z12 ^ t48;
        t57 = t50 ^ t53;
        t58 = z4 ^ t46;
        t59 = z3 ^ t54;
        t60 = t46 ^ t57;
        t61 = z14 ^ t57;
        t62 = t52 ^ t58;
        t63 = t49 ^ t58;
        t64 = z4 ^ t59;
        t65 = t61 ^ t62;
        t66 = z1 ^ t63;
        s0 = t59 ^ t63;
        s6 = t56 ^ ~t62;
        s7 = t48 ^ ~t60;
        t67 = t64 ^ t65;
        s3 = t53 ^ t66;
        s4 = t51 ^ t66;
        s5 = t47 ^ t65;
        s1 = t64 ^ ~s3;
        s2 = t55 ^ ~t67;

        q[7] = s0;
        q[6] = s1;
        q[5] = s2;
        q[4] = s3;
        q[3] = s4;
        q[2] = s5;
        q[1] = s6;
        q[0] = s7;
    }

    /**
     * Row shift on the 64-bit bitsliced state, in place: in every word the low 16-bit group is kept
     * and the three higher 16-bit groups are rotated right by 4, 8 and 12 bits respectively.
     */
    private static void shiftRows(long[] q)
    {
        long x;
        for (int i = 0; i < q.length; i++)
        {
            x = q[i];
            q[i] = (x & 0x000000000000FFFFL)
                | ((x & 0x00000000FFF00000L) >>> 4)
                | ((x & 0x00000000000F0000L) << 12)
                | ((x & 0x0000FF0000000000L) >>> 8)
                | ((x & 0x000000FF00000000L) << 8)
                | ((x & 0xF000000000000000L) >>> 12)
                | ((x & 0x0FFF000000000000L) << 4);
        }
    }

    /**
     * Column mix on the 64-bit bitsliced state, in place. Each output word is an XOR of input words,
     * their 16-bit right rotations and a 32-bit rotation.
     */
    private static void mixColumns(long[] q)
    {
        long q0, q1, q2, q3, q4, q5, q6, q7;
        long r0, r1, r2, r3, r4, r5, r6, r7;

        q0 = q[0];
        q1 = q[1];
        q2 = q[2];
        q3 = q[3];
        q4 = q[4];
        q5 = q[5];
        q6 = q[6];
        q7 = q[7];
        // rN is qN rotated right by 16 bits.
        r0 = (q0 >>> 16) | (q0 << 48);
        r1 = (q1 >>> 16) | (q1 << 48);
        r2 = (q2 >>> 16) | (q2 << 48);
        r3 = (q3 >>> 16) | (q3 << 48);
        r4 = (q4 >>> 16) | (q4 << 48);
        r5 = (q5 >>> 16) | (q5 << 48);
        r6 = (q6 >>> 16) | (q6 << 48);
        r7 = (q7 >>> 16) | (q7 << 48);

        q[0] = q7 ^ r7 ^ r0 ^ rotr32(q0 ^ r0);
        q[1] = q0 ^ r0 ^ q7 ^ r7 ^ r1 ^ rotr32(q1 ^ r1);
        q[2] = q1 ^ r1 ^ r2 ^ rotr32(q2 ^ r2);
        q[3] = q2 ^ r2 ^ q7 ^ r7 ^ r3 ^ rotr32(q3 ^ r3);
        q[4] = q3 ^ r3 ^ q7 ^ r7 ^ r4 ^ rotr32(q4 ^ r4);
        q[5] = q4 ^ r4 ^ r5 ^ rotr32(q5 ^ r5);
        q[6] = q5 ^ r5 ^ r6 ^ rotr32(q6 ^ r6);
        q[7] = q6 ^ r6 ^ r7 ^ rotr32(q7 ^ r7);
    }

    /**
     * Rotates a 64-bit word by 32 bits, i.e. swaps its two halves.
     */
    private static long rotr32(long x)
    {
        return (x << 32) | (x >>> 32);
    }

    /**
     * XORs the first eight words of {@code sk} into the first eight words of {@code q}.
     */
    private static void addRoundKey(long[] q, long[] sk)
    {
        for (int i = 0; i < 8; ++i)
        {
            q[i] ^= sk[i];
        }
    }

    /**
     * Inverse of {@link #brAesCt64InterleaveIn(long[], int, int[], int)}: splits {@code q[pos]} and
     * {@code q[pos + 4]} back into four 32-bit words, written to {@code w[4 * pos]} through
     * {@code w[4 * pos + 3]}.
     *
     * @param w   receives the four words
     * @param q   source state
     * @param pos index of the first source word (the second is {@code pos + 4})
     */
    private static void brAesCt64InterleaveOut(int[] w, long[] q, int pos)
    {
        long x0, x1, x2, x3;

        // Low bytes of each 16-bit group belong to words 0/1, high bytes to words 2/3.
        x0 = q[pos] & 0x00FF00FF00FF00FFL;
        x1 = q[pos + 4] & 0x00FF00FF00FF00FFL;
        x2 = (q[pos] >>> 8) & 0x00FF00FF00FF00FFL;
        x3 = (q[pos + 4] >>> 8) & 0x00FF00FF00FF00FFL;
        x0 |= (x0 >>> 8);
        x1 |= (x1 >>> 8);
        x2 |= (x2 >>> 8);
        x3 |= (x3 >>> 8);
        x0 &= 0x0000FFFF0000FFFFL;
        x1 &= 0x0000FFFF0000FFFFL;
        x2 &= 0x0000FFFF0000FFFFL;
        x3 &= 0x0000FFFF0000FFFFL;
        pos <<= 2;
        w[pos] = (int)(x0 | (x0 >>> 16));
        w[pos + 1] = (int)(x1 | (x1 >>> 16));
        w[pos + 2] = (int)(x2 | (x2 >>> 16));
        w[pos + 3] = (int)(x3 | (x3 >>> 16));
    }

    /**
     * Writes the byte-wise XOR of two ranges into a third: for each {@code i} in
     * {@code [0, zLen)}, {@code z[zOff + i] = x[xOff + i] ^ y[yOff + i]}.
     *
     * @param x    first operand
     * @param xOff start index in {@code x}
     * @param y    second operand
     * @param yOff start index in {@code y}
     * @param z    destination
     * @param zOff start index in {@code z}
     * @param zLen number of bytes to process
     */
    protected static void xor(byte[] x, int xOff, byte[] y, int yOff, byte[] z, int zOff, int zLen)
    {
        for (int i = 0; i < zLen; i++)
        {
            z[zOff + i] = (byte)(x[xOff + i] ^ y[yOff + i]);
        }
    }
}
